This is not Bayesian optimisation, but a test of Gaussian process regression in general, to observe which situations are challenging for these models to cope with. My findings are as follows:
In [ ]:
%load_ext autoreload
%autoreload 2
In [ ]:
from IPython.display import display
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
#import seaborn as sns; sns.set() # prettify matplotlib
import time
import numpy as np
import sklearn.gaussian_process as gp
import GPy
In [ ]:
import turbo.gui.jupyter as tg
tg.jupyter_set_width('80%')
In [ ]:
# make deterministic
np.random.seed(42)
In [ ]:
#f = lambda x: 1 * x * np.cos(x)  # simpler alternative objective (unused)
f = lambda x: 100 * np.sin(x**2/5) * np.cos(x*1.5) + 100
xmin, xmax = 0, 12
xs = np.linspace(xmin, xmax, num=200)
n = 8 # 8 is a good value to show overfitting
#n = 200
X = np.random.uniform(xmin, xmax, size=(n,1))
y = f(X) + np.random.normal(0, 5, size=(n,1))
ys = f(xs)
best_y = np.min(ys)
best_x = xs[np.argmin(ys)]
In [ ]:
plt.figure(figsize=(12, 4))
plt.plot(xs, ys, 'g-', label='objective')
plt.plot(X, y, 'ko', markersize=4, label='sampled points')
plt.legend(loc='upper left')
plt.margins(0.01, 0.1)
plt.xlabel(r'$x$')
plt.ylabel(r'$f(x)$')
plt.show()
The function has been chosen to be particularly difficult for a GP to fit: it has a large non-zero mean (the +100 offset), its oscillation frequency increases with x so no single stationary lengthscale suits the whole domain, and only a handful of noisy samples (n = 8) are available.
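As a rough back-of-the-envelope check (a sketch, not part of the original analysis), the cell below compares the shortest local oscillation period of the objective with the average spacing of the training samples.
In [ ]:
# Sketch: the phase of sin(x**2/5) is x**2/5, so its instantaneous angular
# frequency is 2x/5, and the cos(1.5x) factor shifts this by +/- 1.5. The
# fastest local angular frequency is therefore roughly 2*xmax/5 + 1.5 near
# the right edge of the domain.
fastest_ang_freq = 2 * xmax / 5 + 1.5            # rad per unit x at x = xmax
shortest_period = 2 * np.pi / fastest_ang_freq   # ~1.0 for xmax = 12
avg_spacing = (xmax - xmin) / n                  # ~1.5 for n = 8
print('shortest local period: {:.2f}'.format(shortest_period))
print('average sample spacing: {:.2f}'.format(avg_spacing))
# The average spacing exceeds the shortest period, so the 8 noisy samples
# cannot pin down the fast oscillations, and a single stationary RBF
# lengthscale has to compromise between the smooth and wiggly regions.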
In [ ]:
def plot_GP(mean, sig, title=''):
    mean = mean.flatten()
    sig = sig.flatten()
    plt.figure(figsize=(20,10))
    plt.title(title)
    plt.plot(xs, ys, '--', color='grey', label='true function')
    plt.plot(X, y, 'bx', markersize=7, label='training samples')
    plt.plot(xs, mean, label='GP prediction')
    plt.fill_between(xs, mean-sig, mean+sig, label=r'GP $\pm\sigma$', alpha=0.2)
    plt.legend()
    plt.show()
In [ ]:
def plot_GPs(means, sigs, title=''):
    plt.figure(figsize=(20,10))
    plt.title(title)
    plt.plot(xs, ys, '--', color='grey', label='true function')
    plt.plot(X, y, 'bx', markersize=7, label='training samples')
    for i in range(means.shape[0]):
        mean = means[i]
        sig = sigs[i]
        plt.plot(xs, mean, color='b')
        plt.fill_between(xs, mean-sig, mean+sig, color='b', alpha=0.05)
    plt.legend()
    plt.show()
In [ ]:
kernel = GPy.kern.RBF(input_dim=1)
gp_model = GPy.models.GPRegression(X, y, kernel=kernel)
gp_model.optimize_restarts(messages=False, num_restarts=10);
display(gp_model)
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy 10 restarts RBF kernel')
In [ ]:
kernel = GPy.kern.RBF(input_dim=1) + GPy.kern.Bias(input_dim=1)
gp_model = GPy.models.GPRegression(X, y, kernel=kernel)
gp_model.optimize_restarts(messages=False, num_restarts=10);
display(gp_model)
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy 10 restarts RBF kernel + bias kernel')
In [ ]:
kernel = GPy.kern.RBF(input_dim=1)
gp_model = GPy.models.GPRegression(X, y, kernel=kernel, normalizer=True)
gp_model.optimize_restarts(messages=False, num_restarts=10);
display(gp_model)
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy 10 restarts RBF kernel with normalised Y')
In [ ]:
kernel = GPy.kern.RBF(input_dim=1) + GPy.kern.White(input_dim=1)
# Note: using a white kernel instead of noise_var so that the priors can be specified completely with the kernel rather than requiring access to the model
#kernel.rbf.variance.set_prior(GPy.priors.Gamma.from_EV(0.2, 0.1))
kernel.rbf.lengthscale.set_prior(GPy.priors.Gamma.from_EV(0.5, 0.1))
kernel.white.variance.set_prior(GPy.priors.Gamma.from_EV(0.8, 0.4))
gp_model = GPy.models.GPRegression(X, y, kernel=kernel, normalizer=True, noise_var=1e-10)
gp_model.optimize_restarts(messages=False, num_restarts=10);
display(gp_model)
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy 10 restarts RBF kernel with normalised Y with parameter priors')
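For comparison, here is a sketch (not in the original) of the alternative mentioned in the comment above: rather than adding a White kernel, the noise prior is attached directly to the model's likelihood variance, which requires access to the model object (the MCMC cell below uses this style). The prior values are simply reused from the previous cell.
In [ ]:
# Sketch: priors on the kernel lengthscale and on the model's own likelihood
# noise, instead of on a White kernel (prior values reused from above).
kernel = GPy.kern.RBF(input_dim=1)
kernel.lengthscale.set_prior(GPy.priors.Gamma.from_EV(0.5, 0.1))
gp_model = GPy.models.GPRegression(X, y, kernel=kernel, normalizer=True)
gp_model.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(0.8, 0.4))
gp_model.optimize_restarts(messages=False, num_restarts=10);
display(gp_model)
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy 10 restarts RBF kernel, normalised Y, prior on likelihood noise (sketch)')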
In [ ]:
kernel = GPy.kern.RBF(input_dim=1)
gp_model = GPy.models.SparseGPRegression(X, y, kernel=kernel, normalizer=True)
gp_model.optimize_restarts(messages=False, num_restarts=10);
display(gp_model)
gp_model.plot()
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy sparse GP 10 restarts RBF kernel with normalised Y')
In [ ]:
kernel = GPy.kern.RBF(input_dim=1)
gp_model = GPy.models.GPRegression(X, y, kernel=kernel, normalizer=True)
#gp_model.optimize_restarts(messages=False, num_restarts=10);
# from expected value and variance
gp_model.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.0, 1.0))
gp_model.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.0, 1.0))
gp_model.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.5, 1))
start = time.time()
hmc = GPy.inference.mcmc.HMC(gp_model)#, stepsize=5e-2)
print('burnin')
s = hmc.sample(num_samples=500) # Burnin
print('burnin took: {} seconds'.format(time.time()-start))
start = time.time()
print('sampling')
samples = 200
s = hmc.sample(num_samples=samples)
print('sampling took: {} seconds'.format(time.time()-start))
display(gp_model)
means = np.empty((samples, xs.shape[0]))
sigs = np.empty((samples, xs.shape[0]))
for i, params in enumerate(s):
    gp_model[:] = params  # apply this HMC sample's hyperparameters before predicting
    mean, var = gp_model.predict(xs.reshape(-1,1))
    means[i,:] = mean.flatten()
    sigs[i,:] = np.sqrt(var).flatten()
mean = np.mean(means, axis=0)
sig = np.mean(sigs, axis=0)
plot_GP(mean, sig, 'GPy MCMC (averaged predictions) RBF kernel with normalised Y')
#plot_GPs(means, sigs, 'GPy MCMC RBF kernel with normalised Y')
In [ ]:
gp_model.parameters
In [ ]:
gp_model[:] = np.mean(s, axis=0)
mean, var = gp_model.predict(xs.reshape(-1,1))
plot_GP(mean, np.sqrt(var), 'GPy MCMC (averaged parameters) RBF kernel with normalised Y')
In [ ]:
def plot_samples():
    labels = ['kern variance', 'kern lengthscale', 'noise variance']
    from scipy import stats
    xmin = s.min()
    xmax = s.max()
    xs = np.linspace(xmin, xmax, 100)
    for i in range(s.shape[1]):
        kernel = stats.gaussian_kde(s[:,i])
        plt.plot(xs, kernel(xs), label=labels[i])
    _ = plt.legend()
plot_samples()
In [ ]:
def plot_heatmap():
    labels = ['kern variance', 'kern lengthscale', 'noise variance']
    fig = plt.figure(figsize=(14,4))
    ax = fig.add_subplot(131)
    _ = ax.plot(s[:,0], s[:,1], '.')
    ax.set_xlabel(labels[0]); ax.set_ylabel(labels[1])
    ax = fig.add_subplot(132)
    _ = ax.plot(s[:,1], s[:,2], '.')
    ax.set_xlabel(labels[1]); ax.set_ylabel(labels[2])
    ax = fig.add_subplot(133)
    _ = ax.plot(s[:,0], s[:,2], '.')
    ax.set_xlabel(labels[0]); ax.set_ylabel(labels[2])
    plt.show()
plot_heatmap()
In [ ]:
kernel = gp.kernels.RBF()
gp_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp_model.fit(X, y);
mean, sig = gp_model.predict(xs.reshape(-1,1), return_std=True)
plot_GP(mean, sig, 'scikit 10 restarts RBF kernel')
In [ ]:
kernel = gp.kernels.RBF() + gp.kernels.WhiteKernel()
gp_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp_model.fit(X, y);
mean, sig = gp_model.predict(xs.reshape(-1,1), return_std=True)
plot_GP(mean, sig, 'scikit 10 restarts RBF kernel + white kernel')
In [ ]:
kernel = gp.kernels.RBF() + gp.kernels.WhiteKernel() + 1.0
gp_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp_model.fit(X, y);
mean, sig = gp_model.predict(xs.reshape(-1,1), return_std=True)
plot_GP(mean, sig, 'scikit 10 restarts RBF kernel + white kernel + bias kernel')
In [ ]:
kernel = 1.0 * gp.kernels.RBF() + gp.kernels.WhiteKernel()
gp_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp_model.fit(X, y);
mean, sig = gp_model.predict(xs.reshape(-1,1), return_std=True)
plot_GP(mean, sig, 'scikit 10 restarts constant * RBF kernel + white kernel')
In [ ]:
kernel = 1.0 * gp.kernels.RBF() + gp.kernels.WhiteKernel() + 1.0
gp_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp_model.fit(X, y);
mean, sig = gp_model.predict(xs.reshape(-1,1), return_std=True)
plot_GP(mean, sig, 'scikit 10 restarts constant * RBF kernel + white kernel + bias')
In [ ]:
kernel = 1.0 * gp.kernels.RBF() + gp.kernels.WhiteKernel()
gp_model = gp.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True)
gp_model.fit(X, y);
mean, sig = gp_model.predict(xs.reshape(-1,1), return_std=True)
plot_GP(mean, sig, 'scikit 10 restarts constant * RBF kernel + white kernel with normalised y')
In [ ]: